# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	           ChangeSet	1.1063.1.11 -> 1.1063.1.12
#	include/linux/mmzone.h	1.10    -> 1.11   
#	     mm/page_alloc.c	1.57    -> 1.58   
#	         init/main.c	1.28    -> 1.29   
#	        mm/bootmem.c	1.10    -> 1.11   
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 03/08/12	agruen@suse.de	1.1063.4.24
# [PATCH] More steal_locks fixes: we should be in full LSB compliance now
# 
# By Andreas and Herbert Xu
# 
# Index: linux-2.4.22-rc2.orig/fs/exec.c
# ===================================================================
# --------------------------------------------
# 03/08/12	bjorn.helgaas@hp.com	1.1069.1.10
# ia64: Fix check for binutils that supports "hint" instructions.
# --------------------------------------------
# 03/08/12	bjorn.helgaas@hp.com	1.1074
# Merge hp.com:/home/helgaas/linux/to-marcelo-2.4
# into hp.com:/home/helgaas/linux/linux-ia64-2.4
# --------------------------------------------
# 03/08/12	bjorn.helgaas@hp.com	1.1075
# Merge hp.com:/home/helgaas/linux/ia64-extras
# into hp.com:/home/helgaas/linux/linux-ia64-2.4
# --------------------------------------------
# 03/08/12	steiner@SGI.com	1.1063.1.12
# discontig/NUMA support
# 
# Attached is the patch for discontig memory for 2.4.21. This patch
# has been tested on the ZX1 & NEC platforms & appears to work ok. It
# also works on SN2 but there are additional patches (unrelated to
# discontig) that are still needed in 2.4.21.
# --------------------------------------------
#
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h	Wed Oct  8 09:09:59 2003
+++ b/include/linux/mmzone.h	Wed Oct  8 09:09:59 2003
@@ -8,6 +8,12 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/wait.h>
+#ifdef CONFIG_DISCONTIGMEM
+#include <asm/numnodes.h>
+#endif
+#ifndef MAX_NUMNODES
+#define MAX_NUMNODES 1
+#endif
 
 /*
  * Free memory management - zoned buddy allocator.
@@ -110,7 +116,7 @@
  * footprint of this construct is very small.
  */
 typedef struct zonelist_struct {
-	zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited
+	zone_t * zones [MAX_NUMNODES*MAX_NR_ZONES+1]; // NULL delimited
 } zonelist_t;
 
 #define GFP_ZONEMASK	0x0f
@@ -144,8 +150,8 @@
 extern int numnodes;
 extern pg_data_t *pgdat_list;
 
-#define memclass(pgzone, classzone)	(((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \
-			&& ((pgzone) <= (classzone)))
+#define memclass(pgzone, classzone)            (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \
+((classzone) - (classzone)->zone_pgdat->node_zones))
 
 /*
  * The following two are not meant for general usage. They are here as
@@ -212,6 +218,18 @@
 #define for_each_zone(zone) \
 	for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
 
+#ifdef CONFIG_NUMA
+#define MAX_NR_MEMBLKS  BITS_PER_LONG /* Max number of Memory Blocks */
+#include <asm/topology.h>
+#else /* !CONFIG_NUMA */
+#define MAX_NR_MEMBLKS  1
+#endif /* CONFIG_NUMA */
+
+/* Returns the number of the current Node. */
+
+#ifndef CONFIG_NUMA
+#define numa_node_id()          (__cpu_to_node(smp_processor_id()))
+#endif
 
 #ifndef CONFIG_DISCONTIGMEM
 
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c	Wed Oct  8 09:09:59 2003
+++ b/init/main.c	Wed Oct  8 09:09:59 2003
@@ -290,6 +290,7 @@
 
 
 extern void setup_arch(char **);
+extern void __init build_all_zonelists(void);
 extern void cpu_idle(void);
 
 unsigned long wait_init_idle;
@@ -360,6 +361,7 @@
 	lock_kernel();
 	printk(linux_banner);
 	setup_arch(&command_line);
+	build_all_zonelists();
 	printk("Kernel command line: %s\n", saved_command_line);
 	parse_options(command_line);
 	trap_init();
diff -Nru a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c	Wed Oct  8 09:09:59 2003
+++ b/mm/bootmem.c	Wed Oct  8 09:09:59 2003
@@ -49,8 +49,24 @@
 	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long mapsize = ((end - start)+7)/8;
 
-	pgdat->node_next = pgdat_list;
-	pgdat_list = pgdat;
+
+	/*
+	 * sort pgdat_list so that the lowest one comes first,
+	 * which makes alloc_bootmem_low_pages work as desired.
+	 */
+	if (!pgdat_list || pgdat_list->node_start_paddr > pgdat->node_start_paddr) {
+		pgdat->node_next = pgdat_list;
+		pgdat_list = pgdat;
+	} else {
+		pg_data_t *tmp = pgdat_list;
+		while (tmp->node_next) {
+			if (tmp->node_next->node_start_paddr > pgdat->node_start_paddr)
+				break;
+			tmp = tmp->node_next;
+		}
+		pgdat->node_next = tmp->node_next;
+		tmp->node_next = pgdat;
+	}
 
 	mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
 	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
@@ -259,16 +275,16 @@
 	if (!bdata->node_bootmem_map) BUG();
 
 	count = 0;
+	page = virt_to_page(phys_to_virt(bdata->node_boot_start));
 	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
 	for (i = find_first_zero_bit(bdata->node_bootmem_map, idx);
 	     i < idx;
 	     i = find_next_zero_bit(bdata->node_bootmem_map, idx, i + 1))
 	{
-		page = pgdat->node_mem_map + i;
 		count++;
-		ClearPageReserved(page);
-		set_page_count(page, 1);
-		__free_page(page);
+		ClearPageReserved(page+i);
+		set_page_count(page+i, 1);
+		__free_page(page+i);
 	}
 	total += count;
 
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	Wed Oct  8 09:09:59 2003
+++ b/mm/page_alloc.c	Wed Oct  8 09:09:59 2003
@@ -586,13 +586,44 @@
 /*
  * Builds allocation fallback zone lists.
  */
-static inline void build_zonelists(pg_data_t *pgdat)
+static int __init build_zonelists_node(pg_data_t *pgdat, zonelist_t *zonelist, int j, int k)
 {
-	int i, j, k;
+	zone_t *zone;
+	switch (k) {
+	default:
+		BUG();
+		/*
+		 * fallthrough:
+		 */
+	case ZONE_HIGHMEM:
+		zone = pgdat->node_zones + ZONE_HIGHMEM;
+		if (zone->memsize) {
+#ifndef CONFIG_HIGHMEM
+			BUG();
+#endif
+			zonelist->zones[j++] = zone;
+		}
+	case ZONE_NORMAL:
+		zone = pgdat->node_zones + ZONE_NORMAL;
+		if (zone->memsize)
+			zonelist->zones[j++] = zone;
+	case ZONE_DMA:
+		zone = pgdat->node_zones + ZONE_DMA;
+		if (zone->memsize)
+			zonelist->zones[j++] = zone;
+	}
+
+	return j;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
 
+	local_node = pgdat->node_id;
+	printk("Building zonelist for node : %d\n", local_node);
 	for (i = 0; i <= GFP_ZONEMASK; i++) {
 		zonelist_t *zonelist;
-		zone_t *zone;
 
 		zonelist = pgdat->node_zonelists + i;
 		memset(zonelist, 0, sizeof(*zonelist));
@@ -604,33 +635,32 @@
 		if (i & __GFP_DMA)
 			k = ZONE_DMA;
 
-		switch (k) {
-			default:
-				BUG();
-			/*
-			 * fallthrough:
-			 */
-			case ZONE_HIGHMEM:
-				zone = pgdat->node_zones + ZONE_HIGHMEM;
-				if (zone->memsize) {
-#ifndef CONFIG_HIGHMEM
-					BUG();
-#endif
-					zonelist->zones[j++] = zone;
-				}
-			case ZONE_NORMAL:
-				zone = pgdat->node_zones + ZONE_NORMAL;
-				if (zone->memsize)
-					zonelist->zones[j++] = zone;
-			case ZONE_DMA:
-				zone = pgdat->node_zones + ZONE_DMA;
-				if (zone->memsize)
-					zonelist->zones[j++] = zone;
-		}
+ 		j = build_zonelists_node(pgdat, zonelist, j, k);
+ 		/*
+ 		 * Now we build the zonelist so that it contains the zones
+ 		 * of all the other nodes.
+ 		 * We don't want to pressure a particular node, so when
+ 		 * building the zones for node N, we make sure that the
+ 		 * zones coming right after the local ones are those from
+ 		 * node N+1 (modulo numnodes)
+ 		 */
+ 		for (node = local_node + 1; node < numnodes; node++)
+ 			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ 		for (node = 0; node < local_node; node++)
+ 			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ 
 		zonelist->zones[j++] = NULL;
 	} 
 }
 
+void __init build_all_zonelists(void)
+{
+	int i;
+
+	for(i = 0 ; i < numnodes ; i++)
+		build_zonelists(NODE_DATA(i));
+}
+
 /*
  * Helper functions to size the waitqueue hash table.
  * Essentially these want to choose hash table sizes sufficiently
@@ -742,7 +772,7 @@
 			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
 	}
 	*gmap = pgdat->node_mem_map = lmem_map;
-	pgdat->node_size = totalpages;
+	pgdat->node_size = 0;
 	pgdat->node_start_paddr = zone_start_paddr;
 	pgdat->node_start_mapnr = (lmem_map - mem_map);
 	pgdat->nr_zones = 0;
@@ -766,6 +796,7 @@
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 		zone->need_balance = 0;
+		pgdat->node_size += realsize;
 		if (!size)
 			continue;
 
@@ -850,7 +881,6 @@
 			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
 		}
 	}
-	build_zonelists(pgdat);
 }
 
 void __init free_area_init(unsigned long *zones_size)